/*-----------------------------------------------------------------------------------------------------------------------------------------
File:			Clean WW1 Casualties

Project:		WWI Discrimination (Ferrara & Fishback)	
Last change:	22/JUN/2017 (AF)
Version:		1.1

Change log:		

Purpose:		This do file geolocates and cleans the WWI casualties.
				Part 1-A sets up town/city/place names from geonames.org
					 1-B sets up town/city/place names from NHGIS
					 1-C brings Navy casualty lists into the Army list format
				Part 2   cleans location info of soldiers and assigns county & state FIPS codes
				Part 3	 cleans the names of soldiers (in case we want to link them to the Census)
-----------------------------------------------------------------------------------------------------------------------------------------*/

clear all
capture set more off

* User-written Stata routines needed by this do-file.
* Each SSC package is downloaded only when it is not already installed, so
* re-running the file does not need a network connection.
*   strrec    - recodes state abbreviations into state names (Part 2)
*   freqindex - prerequisite of matchit
*   matchit   - fuzzy matching of place names (Part 2)
foreach pkg in strrec freqindex matchit {
	capture which `pkg'
	if _rc ssc install `pkg'
}

* reclink2 is called in Part 2 but is not installed by this file.
* Stop right away with a clear message instead of failing halfway through.
capture which reclink2
if _rc {
	display as error "reclink2 is not installed; install it before running this do-file (see -search reclink2-)"
	exit 199
}


*******************************************************************************************************************************************
// 1-A) Geonames org place list
*******************************************************************************************************************************************

	* load the geonames.org place list and harmonise the place names
	use "$path/Replication/raw_data/geonamesorg_places_1910.dta"
	
	replace name = subinstr(upper(name), "ST.", "SAINT", .)
	
	* remove the compass words (each followed by a blank) and the "(HISTORICAL)" marker
	foreach dir in NORTH SOUTH WEST EAST {
		replace name = subinstr(name, "`dir' ", "", .)
	}
	replace name = subinstr(name, "(HISTORICAL)", "", .)
	
	* strip outer blanks, then collapse repeated inner blanks
	replace name = stritrim(strtrim(name))
	
	* Alaska and Hawaii are not used
	drop if inlist(state, "AK", "HI")
	
	* one temp file per state; the macros citynames_XX are used again in Part 2
	qui levelsof state, local(statelvl)
	
	qui foreach st of local statelvl {
		preserve
		keep if state=="`st'"
	
		tempfile citynames_`st'
		save `citynames_`st''
		restore
	}
	
	clear


*******************************************************************************************************************************************
// 1-B) NHGIS 1910 place list
*******************************************************************************************************************************************

	* load the NHGIS 1910 place list and harmonise the city names
	use "$path/Replication/raw_data/placenames1910.dta"
	
	replace city = subinstr(city, "ST.", "SAINT", .)
	
	* remove the compass words (each followed by a blank)
	foreach dir in NORTH SOUTH WEST EAST {
		replace city = subinstr(city, "`dir' ", "", .)
	}
	
	* strip outer blanks, then collapse repeated inner blanks
	replace city = stritrim(strtrim(city))
	
	* one temp file per value of STATE
	* NOTE(review): the STATE value becomes part of a local macro name here;
	* confirm that the values contain no blanks and keep the name within
	* Stata's macro-name length limit.
	qui levelsof STATE, local(statelvl2)
	
	qui foreach st of local statelvl2 {
		preserve
		keep if STATE=="`st'"
	
		tempfile NHGIScitynames_`st'
		save `NHGIScitynames_`st''
		restore
	}
	
	clear

	
*******************************************************************************************************************************************
// 1-C) Clean Navy Casualties
*******************************************************************************************************************************************

	clear
	use "$path/Replication/raw_data/Deaths in the US Navy from April 1917 to November 1918.dta"

	* each Navy record is one free-text string (var1); work in upper case
	replace var1 = upper(var1)

	* remove punctuation and other special characters, addressed by character code
	* (34 is the double quote, hence the compound quotes)
	foreach c in 33 34 35 38 46 47 58 59 62 63 124 125 128 147 153 226 {
		replace var1 = subinstr(var1, `"`=char(`c')'"', "", .)
	}

	* commas are used further below to split the record into fields, so drop
	* the commas that only precede a name suffix or a service designation
	foreach sfx in JR SR 1ST 2ND 3RD 4TH USNR USCG {
		replace var1 = subinstr(var1, ", `sfx'", " `sfx'", .)
	}
	replace var1 = subinstr(var1, ", US ", " US ", .)
	* NOTE(review): unlike the line above, this one also removes the blank
	* after USS (", USS X" becomes " USSX"); kept as is, confirm it is intended
	replace var1 = subinstr(var1, ", USS ", " USS", .)
	replace var1 = subinstr(var1, ", UNITED", " UNITED", .)

	* "(" is the field separator used by the first split below. Parentheses
	* that merely annotate a rank or a name are therefore turned into square
	* brackets so that they do not create extra fields.
	foreach open in CLASS OTHER LATER {
		replace var1 = subinstr(var1, "(`open'", "[`open'", .)
	}

	foreach tok in "CHIEF" "FEMALE" "RESERVE" "RIGHT" "INACTIVE" "RADIO" "?" "ACTING"		///
				   "GCMP" "MB" "S" "SIGNALMAN" "RETIRED" "SUPPLY CORPS" "MEDICAL CORPS"	///
				   "GENERAL COURTMARTIAL PRISONER" "GENERAL COURT-MARTIAL PRISONER"		///
				   "NOT IN ACTIVE SERVICE" "G C M P" "TEMPORARY" "PAY CORPS"				///
				   "CIVIL ENGINEER CORPS" "2ND CLASS" {
		replace var1 = subinstr(var1, "(`tok')", "[`tok']", .)
	}
	* kept outside the loop because of the apostrophe in the text
	replace var1 = subinstr(var1, "(NOTE FATHER'S NAME)", "[NOTE FATHER'S NAME]", .)

	* short parenthesised suffixes.
	* Fix: "(R)" used to be rewritten as "[B]", which changed the letter; it
	* now keeps its letter like every other suffix.
	foreach tok in A R G JG E F SC T {
		replace var1 = subinstr(var1, "(`tok')", "[`tok']", .)
	}

	replace var1 = strtrim(var1)
	replace var1 = stritrim(var1)

	* Cut the record at every remaining "(" into var11, var12, ...
	* The code below refers to var12-var15, i.e. it relies on this split
	* producing exactly five pieces for the data at hand.
	split var1, p("(")

	* glue pieces 2-5 back together: everything after the first "(" is treated as one field
	replace var12 = var12 + " " + var13 + " " + var14 + " " + var15
	replace var12 = strtrim(var12)

	drop var13- var15

	* cut that field at "ENLISTED", "APPOINTED" or ")" into var121 and var122 (at most two pieces)
	split var12, p("ENLISTED" "APPOINTED" ")") limit(2)

	* cut var122 at the comma into place1 and place2 (at most two pieces);
	* place2 is treated as the state and place1 as the city in the code that follows
	split var122, p(",") limit(2) gen(place)

	replace place2 = strtrim(place2)

	* without a second piece there is no "place, state" pair, so discard place1 as well
	replace place1 = "" if place2==""

	* Translate the state text found after the comma into two-letter postal codes.
	* The first rule matches "N Y" anywhere in the text; all others need an exact match.
	replace place2 = "NY" if strpos(place2, "N Y")>0
	replace place2 = "IL" if place2=="ILL"
	replace place2 = "MA" if place2=="MASS" | place2=="BOSTON  MASS"
	replace place2 = "CA" if place2=="CALIF"
	replace place2 = "TX" if place2=="TEX"
	replace place2 = "MI" if place2=="MICH"
	replace place2 = "OH" if place2=="OHIO"
	replace place2 = "MO" if place2=="M'O"
	replace place2 = "MN" if place2=="MINN"
	replace place2 = "WI" if place2=="WIS"
	replace place2 = "IN" if place2=="IND"
	replace place2 = "RI" if place2=="R I"
	replace place2 = "IA" if place2=="IOWA"
	replace place2 = "CT" if place2=="CONN"
	replace place2 = "NE" if place2=="NEBR"
	replace place2 = "TN" if place2=="TENN"
	replace place2 = "OR" if place2=="OREG" | place2=="ORE" | place2=="OREGON"
	replace place2 = "NJ" if place2=="N J"
	replace place2 = "DC" if place2=="D C"
	replace place2 = "SC" if place2=="S C"
	replace place2 = "NC" if place2=="N C"
	replace place2 = "WA" if place2=="WASH" | place2=="'WASH"
	replace place2 = "AL" if place2=="ALA"
	replace place2 = "CO" if place2=="COLO"
	replace place2 = "MS" if place2=="MISS"
	replace place2 = "UT" if place2=="UTAH"
	replace place2 = "OK" if place2=="OKLA"
	replace place2 = "AR" if place2=="ARK"
	replace place2 = "FL" if place2=="FLA"
	replace place2 = "WV" if place2=="W VA"
	replace place2 = "NH" if place2=="N H"
	replace place2 = "NY" if place2=="NEW YORK"
	replace place2 = "PA" if place2=="PHILADELPHIA" | place2=="PENNSYLVANIA" | place2=="PA -" | place2=="PA-"
	replace place2 = "MA" if place2=="BOSTON"
	replace place2 = "NY" if place2=="LONG ISLAND"
	replace place2 = "MT" if place2=="MONT"
	* Fix: "NEV" (Nevada) used to be coded as "MT" (Montana)
	replace place2 = "NV" if place2=="NEV"
	replace place2 = "ID" if place2=="IDAHO"
	replace place2 = "KS" if place2=="KANS"
	replace place2 = "LA" if place2=="LOUISIANA"
	replace place2 = "MD" if place2=="MD '"

	* Blank out everything that is not one of the 48 contiguous states or DC.
	* Same set of codes as the former chain of != conditions, written as a
	* whitelist; the helper variable is removed again right away.
	gen byte _okstate = 0
	foreach s in AL AZ AR CA CO CT DE DC FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT ///
				 NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY {
		replace _okstate = 1 if place2=="`s'"
	}
	replace place2 = "" if _okstate==0
	drop _okstate

	* Fallback for records that still lack a state: look at the last three
	* words of var121 (word1 = last word, word2 = second to last, ...)
	forvalues k = 1/3 {
		gen word`k' = word(var121, -`k') if place2==""
	}

	* two single letters at the end (e.g. "N" "Y") are joined into a code
	replace place2 = word2+word1 if place2=="" & length(word1)==1 & length(word2)==1

	* a two-letter last word is taken as the code
	replace place2 = word1 if place2=="" & length(word1)==2

	* longer abbreviations and full names, given as "text code"
	foreach pair in "MASS MA"  "FLA FL"   "CALIF CA" "ILL IL"   "TEX TX"   "OHIO OH"  "MICH MI"	///
					"TENN TN"  "IND IN"   "CONN CT"  "ALA AL"   "MISS MS"  "WIS WI"   "OREG OR"	///
					"WASH WA"  "IOWA IA"  "KANS KS"  "COLO CO"  "DEL DE"   "MINN MN"  "OKLA OK"	///
					"CAL CA"   "NEBR NE"  "PENN PA"  "WYO WY"   "ARIZ AZ"  "IDAHO ID" "KENTUCKY KY" {
		local from : word 1 of `pair'
		local to   : word 2 of `pair'
		replace place2 = "`to'" if place2=="" & word1=="`from'"
	}

	drop word*

	* Next fallback: last word of var122.
	* Each rule reads "code text [text ...]"; a record that is still without
	* a state gets the code when its last word equals one of the texts.
	gen lastword = word(var122, -1)

	foreach rule in "NY YORK Y"          "MA MASSACHUSETTS MASS" "PA PENNSYLVANIA PA" "NJ JERSEY J"		///
					"CA CALIFORNIA CALIF" "OH OHIO"              "DC C COLUMBIA"      "VA VIRGINIA VA"	///
					"IL ILLINOIS ILL"     "MI MICHIGAN MICH"     "MD MARYLAND MD"     "AL ALA ALABAMA"	///
					"CT CONNECTICUT CONN" "TN TENN TENNESSEE"    "TX TEXAS TEX"       "LA LOUISIANA LA"	///
					"IN INDIANA"          "ME MAINE ME"          "MN MINNESOTA"       "NE NEBR"			///
					"WI WISCONSIN WIS"    "CO COLORADO"          "DE DELAWARE"        "NH HAMPSHIRE H"	///
					"IA IOWA"             "MO MISSOURI"          "MS MISS"            "OR OREGON"		///
					"GA GA GEORGIA"       "FL FLORIDA FLA"       "KY KENTUCKY KY"     "WA WA WASH" {
		gettoken code variants : rule
		foreach v of local variants {
			replace place2 = "`code'" if lastword=="`v'" & place2==""
		}
	}

	drop lastword

	* how many records are still without a state?
	count if place2==""

	* keep only valid codes (48 contiguous states + DC); anything else is blanked.
	* tag stays in the data until it is dropped further below.
	local validstates AL AZ AR CA CO CT DE DC FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT ///
					  NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY
	gen tag = 0
	foreach st of local validstates {
		replace tag = 1 if place2=="`st'"
	}
	replace place2 = "" if tag==0

	* Last fallback, for records still without a state: cut the full record at
	* "ENLISTED" and use the second word after it, with commas removed.
	split var1 if place2=="", p("ENLISTED") gen(temp)
	gen lastword = strtrim(subinstr(word(temp2, 2), ",", "", .))

	* rules read "code text [text ...]" and only fill records that are still empty
	foreach rule in "NY YORK Y"          "MA MASSACHUSETTS MASS" "PA PENNSYLVANIA PA" "NJ JERSEY J"		///
					"CA CALIFORNIA CALIF" "OH OHIO"              "DC C COLUMBIA"      "VA VIRGINIA VA"	///
					"IL ILLINOIS ILL"     "MI MICHIGAN MICH"     "MD MARYLAND MD"     "AL ALA ALABAMA"	///
					"CT CONNECTICUT CONN" "TN TENN TENNESSEE"    "TX TEXAS TEX"       "LA LOUISIANA LA"	///
					"IN IND INDIANA"      "ME MAINE ME"          "MN MINN MINNESOTA"  "NE NEBR"			///
					"WI WISCONSIN WIS"    "CO COLORADO"          "DE DELAWARE"        "NH HAMPSHIRE H"	///
					"IA IOWA"             "MO MISSOURI"          "MS MISS"            "OR OREGON"		///
					"GA GA GEORGIA"       "FL FLORIDA FLA"       "KY KENTUCKY KY"     "WA WA WASH"		///
					"VT VT" {
		gettoken code variants : rule
		foreach v of local variants {
			replace place2 = "`code'" if lastword=="`v'" & place2==""
		}
	}

	* place2 now holds the state code; records without one are dropped
	ren place2 state

	drop temp* tag
	drop if state==""

	* place1 is not a town when it mentions a ship, unit or institution
	foreach kw in USS USCG SECTION SHIP TRAINING SUBMARINE NAVY RECALLED NAVAL RECRUITING AGRICULTURAL {
		replace place1 = "" if strpos(place1, "`kw'")>0
	}
	replace place1 = "" if strpos(place1, "U S S")>0

	* Second source for the town: take all blanks out of var121, turn commas
	* and the word STREET into blanks, and use the second to last word.
	replace var121 = subinstr(subinstr(subinstr(var121, "`=char(32)'", "", .), ",", " ", .), "STREET", " ", .)
	gen place1a = word(var121, -2)

	replace place1 = place1a if place1==""

	* removing the blanks above glued the two words of this name together
	replace place1 = "NEW YORK" if place1=="NEWYORK"

	ren place1 city
	drop place1a lastword

	* var11 is the part of the record before the first "(".
	* Cut it at the commas; the first three pieces are used as surname, name and rank.
	* The drop below relies on this split producing exactly five pieces for the data at hand.
	split var11, p(",")
	drop var114 var115

	ren (var111 var112 var113) (surname name rank)

	* fix missing commas
	* no third piece and a surname of several words: the second piece is taken to be the rank
	replace rank = name if rank=="" & wordcount(surname)>1

	* records that still have no rank: use the second piece as rank ...
	gen tag = rank==""
	replace rank = name if tag==1
	* ... and its first word as the name
	replace name = word(rank, 1) if name==rank & tag==1

	* where name and rank are still identical, the given names are taken from
	* words 2 and 3 of the surname field
	replace name = word(surname, 2) + " " + word(surname, 3) if name==rank

	* remove the given names from the surname field
	* NOTE(review): name is passed to regexr() as a regular expression, so
	* any regex metacharacter left in a name would be interpreted as such
	replace surname = strtrim(regexr(surname, name, ""))

	* for alternative spellings ("X OR Y") keep the first one
	split surname, p(" OR ") limit(1)
	replace surname = surname1

	drop tag surname1

	* Cause of death from keywords in var12. Everybody starts as "KIA"; the
	* groups are applied in this order, so a later group overrides an earlier one.
	gen status = "KIA"

	foreach kw in DISEASE INFLUENZA POISON {
		replace status = "Disease" if strpos(var12, "`kw'")>0
	}

	foreach kw in ACCIDENT COLLISION OVERBOARD RAMMED DROWNED {
		replace status = "Accident" if strpos(var12, "`kw'")>0
	}

	foreach kw in DOI DOW WOUNDS INJUR {
		replace status = "Wounds" if strpos(var12, "`kw'")>0
	}

	* the Navy list is stored as volume number 4
	gen volume = "4"

	keep city state surname name rank status volume

	destring volume, replace

	* running record number
	sort surname name state
	gen serial = _n

	compress
	save "$path/Replication/cleaned_data/WWI Navy Casualties clean.dta", replace
	clear
	
	
*******************************************************************************************************************************************
// 2) Clean location info in the WWI Casualty Lists (Army + Navy)
*******************************************************************************************************************************************

	
	// WWI Army Casualty Records
	* Army records from the spreadsheet (first row holds the variable names),
	* followed by the cleaned Navy records
	import excel "$path/Replication/raw_data/WW1 casualty list.xlsx", sheet("Sheet1") firstrow
	append using "$path/Replication/cleaned_data/WWI Navy Casualties clean.dta"
	
	* value labels for volume: 1-3 are the Army volumes, 4 marks the Navy source
	label define volnum 1 "Army Vol 1" 2 "Army Vol 2" 3 "Army Vol 3" 4 "Navy"
	label values volume volnum
	
	replace city = upper(city)
	
	* characters to delete from the city name, given as character codes
	* (49, 51, 52 and 54 are the digits 1, 3, 4 and 6)
	local citychars 33 34 39 40 42 46 47 49 51 52 54 58 59 62 63 91 93 94 95 126 128 129 131 136 	///
					141 142 148 154 155 157 160 162 164 167 168 171 174 176 177 179 180 181 182 186	///
					187 188 191 194 195 208 209 226
	
	qui foreach code of local citychars {
		replace city = subinstr(city, `"`=char(`code')'"', "", .)
	}
	
	* strip outer blanks, then collapse repeated inner blanks
	replace city = stritrim(strtrim(city))

	* keep a copy of the soldier-level data for later
	tempfile soldiers
	save `soldiers'
	
	
	// generate a list of unique city/town/place names by state, assign FIPS codes and then merge the list back to the casualties
	* n counts the soldiers per state-city combination
	gen n = 1
	
	collapse (sum) n, by(state city)
	
	* statenames
	qui strrec state 	("AL" = "ALABAMA")			("AK" = "ALASKA")		("AZ" = "ARIZONA")		("AR" = "ARKANSAS") 		///
						("CA" = "CALIFORNIA") 		("CO" = "COLORADO") 	("CT" = "CONNECTICUT") 	("DE" = "DELAWARE") 		///
						("FL" = "FLORIDA") 			("GA" = "GEORGIA") 		("HI" = "HAWAII")		("ID" = "IDAHO") 			///
						("IL" = "ILLINOIS") 		("IN" = "INDIANA") 		("IA" = "IOWA") 		("KS" = "KANSAS") 			///
						("KY" = "KENTUCKY") 		("LA" = "LOUISIANA") 	("ME" = "MAINE") 		("MD" = "MARYLAND") 		///
						("MA" = "MASSACHUSETTS") 	("MI" = "MICHIGAN") 	("MN" = "MINNESOTA") 	("MS" = "MISSISSIPPI") 		///
						("MO" = "MISSOURI") 		("MT" = "MONTANA") 		("NE" = "NEBRASKA") 	("NV" = "NEVADA") 			///
						("NH" = "NEW HAMPSHIRE") 	("NJ" = "NEW JERSEY") 	("NM" = "NEW MEXICO") 	("NY" = "NEW YORK") 		///
						("NC" = "NORTH CAROLINA") 	("ND" = "NORTH DAKOTA") ("OH" = "OHIO") 		("OK" = "OKLAHOMA") 		///
						("OR" = "OREGON") 			("PA" = "PENNSYLVANIA") ("RI" = "RHODE ISLAND") ("SC" = "SOUTH CAROLINA")	///
						("SD" = "SOUTH DAKOTA") 	("TN" = "TENNESSEE") 	("TX" = "TEXAS") 		("UT" = "UTAH") 			///
						("VT" = "VERMONT") 			("VA" = "VIRGINIA") 	("WA" = "WASHINGTON") 	("WV" = "WEST VIRGINIA")	///
						("WI" = "WISCONSIN") 		("WY" = "WYOMING")		("DC" = "DISTRICT OF COLUMBIA"), 					///
						gen(STATE)
	
	* city name id
	gen idm = _n
							
	ren city name
	
	* NOTE(review): periods (character 46) were already removed from city
	* before this point, so "ST." cannot occur here any more
	replace name = subinstr(name, "ST.", "SAINT", .)
	
	* Remove compass words exactly as in Parts 1-A and 1-B, i.e. only when they
	* are followed by a blank. Fix: the patterns used to lack the blank, which
	* also cut the letters out of longer names (NORTHAMPTON -> AMPTON,
	* KEY WEST -> KEY) while the reference lists kept those names intact.
	foreach dir in NORTH SOUTH WEST EAST {
		replace name = subinstr(name, "`dir' ", "", .)
	}
	
	replace name = strtrim(name)
	replace name = stritrim(name)
	
	tempfile maindata
	save `maindata'
	
	* match city names with the geonames.org list by state
	* (loops over the states found in the geonames list in Part 1-A; for each
	*  state the casualty places are compared with that state's temp file using
	*  soundex similarity, and the candidate pairs are stored in a temp file)
	qui foreach s of local statelvl {
		preserve
		keep if state=="`s'"
		
		matchit idm name using `citynames_`s'', idu(id) txtu(name) simi(soundex) gen(score) override
	
		tempfile matches_`s'
		save `matches_`s''
		restore
	}
	
	clear
	
	* append all single state files
	qui foreach s of local statelvl {
		append using `matches_`s''
	}
	
	* assess match quality using bi-gram
	* (name1 is not created in this file; presumably it is the geonames place
	*  name returned by the matchit call above - confirm)
	matchit name name1, gen(score_bi)
	
	* mark best match
	* (ties are all kept at this point)
	egen double maxscore = max(score_bi), by(idm)
	keep if score_bi==maxscore
	
	* merge back main data to city list
	merge m:1 idm using `maindata'
	drop if _merge==2
	drop _merge
	
	* place info from geonames.org
	merge m:1 id using "$path/Replication/raw_data/geonamesorg_places_1910.dta"
	drop if _merge==2
	drop _merge
	
	* county info from Census
	ren icpsrfip fips
	merge m:1 fips using "$path/Replication/raw_data/1910census_county.dta", keepusing(totpop)
	drop if _merge==2
	drop _merge
	
	drop if name==""
	
	* for duplicates, take the place in the county with the highest pop
	egen maxpop = max(totpop), by(name state)
	
	keep if totpop==maxpop
	* any remaining ties are resolved arbitrarily (force)
	duplicates drop name state, force
	
	* keep relevant variables
	keep id idm name state STATE icpsrst icpsrcty icpsrnam statenam icpsrsti icpsrctyi fips gisjoin
	
	* NOTE(review): name is the harmonised spelling saved in `maindata' (after
	* the ST./NORTH/SOUTH/WEST/EAST replacements), while the soldier file that
	* is merged on city and state further below still has the unmodified city.
	* Places whose name was changed by the harmonisation therefore cannot
	* match in that merge - confirm whether this is intended.
	ren name city
	
	* save the geolocated city list as temp file
	compress
	tempfile citylist
	save `citylist'
	clear
	
	
	// soldier-level data again, now combined with the geolocated city list
	use `soldiers'
	
	merge m:1 city state using `citylist'
	drop if _merge==2
	
	* STATE only came along for matched cities; rebuild it from the state code
	* for every record
	drop STATE
	
	qui strrec state 	("AL" = "ALABAMA") 			("AK" = "ALASKA") 			("AZ" = "ARIZONA") 			///
						("AR" = "ARKANSAS") 		("CA" = "CALIFORNIA") 		("CO" = "COLORADO") 		///
						("CT" = "CONNECTICUT") 		("DE" = "DELAWARE") 		("FL" = "FLORIDA") 			///
						("GA" = "GEORGIA") 			("HI" = "HAWAII") 			("ID" = "IDAHO") 			///
						("IL" = "ILLINOIS") 		("IN" = "INDIANA") 			("IA" = "IOWA") 			///
						("KS" = "KANSAS") 			("KY" = "KENTUCKY") 		("LA" = "LOUISIANA") 		///
						("ME" = "MAINE") 			("MD" = "MARYLAND") 		("MA" = "MASSACHUSETTS") 	///
						("MI" = "MICHIGAN") 		("MN" = "MINNESOTA") 		("MS" = "MISSISSIPPI") 		///
						("MO" = "MISSOURI") 		("MT" = "MONTANA") 			("NE" = "NEBRASKA") 		///
						("NV" = "NEVADA") 			("NH" = "NEW HAMPSHIRE") 	("NJ" = "NEW JERSEY") 		///
						("NM" = "NEW MEXICO") 		("NY" = "NEW YORK") 		("NC" = "NORTH CAROLINA") 	///
						("ND" = "NORTH DAKOTA") 	("OH" = "OHIO") 			("OK" = "OKLAHOMA") 		///
						("OR" = "OREGON") 			("PA" = "PENNSYLVANIA") 	("RI" = "RHODE ISLAND") 	///
						("SC" = "SOUTH CAROLINA") 	("SD" = "SOUTH DAKOTA") 	("TN" = "TENNESSEE") 		///
						("TX" = "TEXAS") 			("UT" = "UTAH") 			("VT" = "VERMONT") 			///
						("VA" = "VIRGINIA") 		("WA" = "WASHINGTON") 		("WV" = "WEST VIRGINIA") 	///
						("WI" = "WISCONSIN") 		("WY" = "WYOMING") 			("DC" = "DISTRICT OF COLUMBIA"), ///
						gen(STATE)
	
	* Store the current state of the data; it is reloaded once additional
	* matches for the still unmatched cities have been found. Saving does not
	* change the data in memory, so no preserve/restore is needed around it.
	tempfile tempmatch
	save `tempmatch'

	
	* look at unmatched cities
	* (_merge==1: soldier records whose city/state was not found in the geolocated city list)
	keep if _merge==1
	* remove the (empty) variables that came from the city list; relies on the variable order
	drop idm- gisjoin
	
	* NOTE(review): this changes the spelling of city, but the file saved in
	* `tempmatch' keeps the old spelling; the merge on city and STATE further
	* below can therefore not bring these "ST ..." places back - confirm.
	replace city = subinstr(city, "ST ", "SAINT ", .) if word(city, 1)=="ST"
		
	* (repeats the selection from above)
	keep if _merge==1
	gen n = 1
	
	* one row per city and state, with the number of soldiers in sumn
	egen sumn = sum(n), by(city state)
	duplicates drop city state, force
	
	* list the most frequently appearing unmatched cities
	gsort -sumn city state
	* free the name _merge; a variable of that name is used again after reclink2 below
	ren _merge merge1
	
	gen idm = _n

	* use NHGIS city list as additional source to geolocate the unmatched cities
	* (STATE has to agree exactly; city is matched approximately)
	reclink2 city STATE using "$path/Replication/raw_data/placenames1910.dta", idm(idm) idu(_ID) required(STATE) gen(score)
	
	drop if city==""
	
	* compute bi-gram score to check match quality between WW1 city and the matched NHGIS city
	* (Ucity is not created in this file; presumably it is the NHGIS city returned by reclink2 - confirm)
	matchit city Ucity, gen(score_bi)
	
	* keep best match
	egen double maxscore = max(score_bi), by(idm)
	keep if score_bi==maxscore
	* matches with a bi-gram score below 0.6 are discarded
	drop if score_bi<0.6
	
	* any remaining ties are resolved arbitrarily (force)
	duplicates drop city STATE, force
	
	keep if _merge==3
	drop _merge

	* save the previously unmatched cities that are now geocoded
	tempfile placematch
	save `placematch'
	clear

	
	* and merge it back to our previous soldier data
	use `tempmatch'
	* merge1: result of the merge with the geonames-based city list
	ren _merge merge1
	
	merge m:1 city STATE using `placematch'
	drop if _merge==2
	* merge2: result of the merge with the NHGIS-based matches
	ren _merge merge2
	
	* place code = characters 5-9 of NHGISPLACE
	gen placefips = substr(NHGISPLACE, 5, 5)
	
	* look up the county of each place
	merge m:1 placefips state using "$path/Replication/raw_data/county_place_ids.dta"
	drop if _merge==2
	drop _merge
	
	* combine state and county code into one number
	* (the + followed by destring implies that both parts are strings)
	gen fips2 = statefips + countyfips
	destring fips2, replace
	
	* use the NHGIS-based county only where the geonames-based match gave none
	replace fips = fips2 if fips==. & fips2!=.
	
	* keeps the variables from serial through volume (relies on the variable order) plus fips
	keep serial- volume fips
	
	
	* some manual fixes (I hand-coded places with >3 soldiers)
	* Every rule only fills in a county code that is still missing (fips==.).
	
		// West Hoboken (merged with Union Hill into Union City) and North Bergen in Hudson County, NJ 
		replace fips = 34017 if city=="WEST HOBOKEN" & fips==. & state=="NJ"
		replace fips = 34017 if city=="NORTH BERGEN" & fips==. & state=="NJ"
		
		// South and West Orange, and Peabody in Essex County, NJ
		// Fix: these lines used to assign 34017, the Hudson County code from the
		// rule above; Essex County, NJ is 34013.
		// NOTE(review): Peabody is better known as a city in Essex County,
		// Massachusetts (25009) - confirm which state the PEAHODY records carry.
		replace fips = 34013 if city=="WEST ORANGE" & fips==. & state=="NJ"
		replace fips = 34013 if city=="SOUTH ORANGE" & fips==. & state=="NJ"
		replace fips = 34013 if city=="PEAHODY" & fips==. & state=="NJ"
		replace city = "PEABODY" if city=="PEAHODY"
		
		// Westerly in Washington County, RI
		replace fips = 44009 if city=="WESTERLY" & fips==. & state=="RI"
	
		// Westfield in Hampden County, MA
		replace fips = 25013 if city=="WESTFIELD" & fips==. & state=="MA"
		
		// Manchester in Hartford County, CT
		replace fips = 9003 if city=="SOUTH MANCHESTER" & fips==. & state=="CT"
		
		// St Albans in Franklin County, VT
		replace fips = 50011 if city=="ST ALBANS" & fips==. & state=="VT"
		
		// Southbridge in Worcester County, MA
		replace fips = 25027 if city=="SOUTHBRIDGE" & fips==. & state=="MA"
		
		// North Yakima in Yakima County, WA
		replace fips = 53077 if city=="NORTH YAKIMA" & fips==. & state=="WA"
		
		// St Joseph in Berrien County, MI
		replace fips = 26021 if city=="ST JOSEPH" & fips==. & state=="MI"
		
		// North Attleboro in Bristol County, MA
		replace fips = 25005 if city=="NORTH ATTLEBORO" & fips==. & state=="MA"
		
		// Troy in Rensselaer County, NY
		replace fips = 36083 if city=="NORTH TROY" & fips==. & state=="NY"
		
		// South Norwalk in Fairfield County, CT
		replace fips = 9001 if city=="SOUTH NORWALK" & fips==. & state=="CT"
		
		// West Roxbury in Suffolk County, MA
		replace fips = 25025 if city=="WEST ROXBURY" & fips==. & state=="MA"
		
		// Dryden in Harmon County, OK
		replace fips = 40057 if city=="DRYDEN" & fips==. & state=="OK"
		
		// East Greenwich in Kent County, RI
		replace fips = 44003 if city=="EAST GREENWICH" & fips==. & state=="RI"
		
		// Fargo in Cass County, ND
		replace fips = 38017 if city=="NORTH FARGO" & fips==. & state=="ND"
		
		// South Haven in Allegan County, MI
		// NOTE(review): South Haven is usually listed under Van Buren County
		// (26159) - confirm the county; code left unchanged.
		replace fips = 26005 if city=="SOUTH HAVEN" & fips==. & state=="MI"
		
		// West Bridgewater in Plymouth County, MA
		replace fips = 25023 if city=="WEST BRIDGEWATER" & fips==. & state=="MA"
		
		// Bryan in Brazos County, TX
		replace fips = 48041 if city=="BRVAN" & fips==. & state=="TX"
		replace city = "BRYAN" if city=="BRVAN" & state=="TX"
		
		// Akron in Summit County, OH
		replace fips = 39153 if city=="EAST AKRON" & fips==. & state=="OH"
		
		// East Milton and South Weymouth in Norfolk County, MA 
		replace fips = 25021 if city=="EAST MILTON" & fips==. & state=="MA"
		replace fips = 25021 if city=="SOUTH WEYMOUTH" & fips==. & state=="MA"
		
		// Easthampton in Hampshire County, MA 
		replace fips = 25015 if city=="EASTHAMPTON" & fips==. & state=="MA"
		
		// North English in Iowa County, IA
		replace fips = 19095 if city=="NORTH ENGLISH" & fips==. & state=="IA"
		
		// Northfield in Dakota County, MN
		// NOTE(review): most of Northfield lies in Rice County (27131) -
		// confirm the county; code left unchanged.
		replace fips = 27037 if city=="NORTHFIELD" & fips==. & state=="MN"
		
		// Northville in Oakland County, MI
		replace fips = 26125 if city=="NORTHVILLE" & fips==. & state=="MI"
		
		// Westside in Crawford County, IA
		replace fips = 19047 if city=="WEST SIDE" & fips==. & state=="IA"
		
		
	
	* Split the county FIPS code into its state part (2 digits) and county
	* part (3 digits).
	* Fix: the code is written with leading zeros before it is cut. Without
	* the padding a four-digit code such as 9003 (state codes below 10) was
	* cut into "90" and "03" instead of "09" and "003".
	* Records without a county code get empty strings in both parts.
	gen fipstemp = string(fips, "%05.0f") if !missing(fips)
	
	gen statefips = substr(fipstemp, 1, 2)
	gen countyfips = substr(fipstemp, 3, 3)
	drop fipstemp
		
		
*******************************************************************************************************************************************
// 3) Clean names
*******************************************************************************************************************************************
		
	// Standardise the first/middle name and the surname: upper-case both, then
	// delete punctuation, digits and stray symbols identified by character code.
	
	foreach namevar of varlist name surname {
		replace `namevar' = upper(`namevar')
	}
	
	// Character codes to delete from both name variables.
	qui foreach code in 33 34 38 39 40 41 42 44 45 46 47 48 49 51 54 57 58 59 62 63 92 93 94 95 125 	///
				126 128 129 130 131 137 139 141 146 147 148 154 157 158 159 161 162 163 165 167 	///
				168 171 173 175 176 187 191 194 195 208 209 226 {
				
		foreach namevar of varlist name surname {
			replace `namevar' = subinstr(`namevar', `"`=char(`code')'"', "", .)
		}
	}
	
	// Leftover marker text in the first-name field.
	replace name = subinstr(name, "[RIGHT", "", .)
	
	// Remove leading/trailing blanks, then collapse internal runs of blanks
	// (first name only; surname is not trimmed here).
	replace name = stritrim(strtrim(name))
	
	* Replace common name abbreviations with the full name.
	*
	* An abbreviation is expanded when it is exactly the first or the second word
	* of name. subinword() is used so that only whole words are rewritten; a
	* substring replacement would also hit other words that merely contain the
	* abbreviation (e.g. "FRED ALFRED" would become "FREDERICK ALFREDERICK").
	*
	* Each abbreviation appears once. JOSH is expanded to JOSHUA; a second rule
	* mapping JOSH to JOSIAH could never fire after the first and is omitted.
	qui foreach i in 1 2 {
		replace name = subinword(name, "ABR",     "ABRAHAM",     .) if word(name, `i')=="ABR"
		replace name = subinword(name, "ALEX",    "ALEXANDER",   .) if word(name, `i')=="ALEX"
		replace name = subinword(name, "AMB",     "AMBROSE",     .) if word(name, `i')=="AMB"
		replace name = subinword(name, "AND",     "ANDREW",      .) if word(name, `i')=="AND"
		replace name = subinword(name, "ANT",     "ANTHONY",     .) if word(name, `i')=="ANT"
		replace name = subinword(name, "ART",     "ARTHUR",      .) if word(name, `i')=="ART"
		replace name = subinword(name, "AUG",     "AUGUSTUS",    .) if word(name, `i')=="AUG"
		replace name = subinword(name, "BART",    "BARTHOLOMEW", .) if word(name, `i')=="BART"
		replace name = subinword(name, "BENJ",    "BENJAMIN",    .) if word(name, `i')=="BENJ"
		replace name = subinword(name, "CHAS",    "CHARLES",     .) if word(name, `i')=="CHAS"
		replace name = subinword(name, "CHR",     "CHRISTIAN",   .) if word(name, `i')=="CHR"
		replace name = subinword(name, "CLEM",    "CLEMENT",     .) if word(name, `i')=="CLEM"
		replace name = subinword(name, "CORN",    "CORNELIUS",   .) if word(name, `i')=="CORN"
		replace name = subinword(name, "DANL",    "DANIEL",      .) if word(name, `i')=="DANL"
		replace name = subinword(name, "DAV",     "DAVID",       .) if word(name, `i')=="DAV"
		replace name = subinword(name, "DEN",     "DENNIS",      .) if word(name, `i')=="DEN"
		replace name = subinword(name, "DOUG",    "DOUGLAS",     .) if word(name, `i')=="DOUG"
		replace name = subinword(name, "EDM",     "EDMUND",      .) if word(name, `i')=="EDM"
		replace name = subinword(name, "EDR",     "EDWARD",      .) if word(name, `i')=="EDR"
		replace name = subinword(name, "EDW",     "EDWARD",      .) if word(name, `i')=="EDW"
		replace name = subinword(name, "EZEK",    "EZEKIEL",     .) if word(name, `i')=="EZEK"
		replace name = subinword(name, "FRED",    "FREDERICK",   .) if word(name, `i')=="FRED"
		replace name = subinword(name, "FROO",    "FRANCO",      .) if word(name, `i')=="FROO"
		replace name = subinword(name, "FS",      "FRANCIS",     .) if word(name, `i')=="FS"
		replace name = subinword(name, "GEO",     "GEORGE",      .) if word(name, `i')=="GEO"
		replace name = subinword(name, "GEOF",    "GEOFFREY",    .) if word(name, `i')=="GEOF"
		replace name = subinword(name, "GODF",    "GODFREY",     .) if word(name, `i')=="GODF"
		replace name = subinword(name, "GREG",    "GREGORY",     .) if word(name, `i')=="GREG"
		replace name = subinword(name, "GUL",     "WILLIAM",     .) if word(name, `i')=="GUL"
		replace name = subinword(name, "HEN",     "HENRY",       .) if word(name, `i')=="HEN"
		replace name = subinword(name, "HERB",    "HERBERT",     .) if word(name, `i')=="HERB"
		replace name = subinword(name, "HY",      "HENRY",       .) if word(name, `i')=="HY"
		replace name = subinword(name, "IOH",     "JOHN",        .) if word(name, `i')=="IOH"
		replace name = subinword(name, "IS",      "ISAAC",       .) if word(name, `i')=="IS"
		replace name = subinword(name, "JAB",     "JAMES",       .) if word(name, `i')=="JAB"
		replace name = subinword(name, "JAC",     "JAMES",       .) if word(name, `i')=="JAC"
		replace name = subinword(name, "JAS",     "JAMES",       .) if word(name, `i')=="JAS"
		replace name = subinword(name, "JER",     "JEREMIAH",    .) if word(name, `i')=="JER"
		replace name = subinword(name, "JNO",     "JOHN",        .) if word(name, `i')=="JNO"
		replace name = subinword(name, "JON",     "JONATHAN",    .) if word(name, `i')=="JON"
		replace name = subinword(name, "JOS",     "JOSEPH",      .) if word(name, `i')=="JOS"
		replace name = subinword(name, "JOSH",    "JOSHUA",      .) if word(name, `i')=="JOSH"
		replace name = subinword(name, "LAU",     "LAURENCE",    .) if word(name, `i')=="LAU"
		replace name = subinword(name, "LAWR",    "LAWRENCE",    .) if word(name, `i')=="LAWR"
		replace name = subinword(name, "LEON",    "LEONARD",     .) if word(name, `i')=="LEON"
		replace name = subinword(name, "MATH",    "MATTHIAS",    .) if word(name, `i')=="MATH"
		replace name = subinword(name, "MATT",    "MATTHEW",     .) if word(name, `i')=="MATT"
		replace name = subinword(name, "MAU",     "MAURICE",     .) if word(name, `i')=="MAU"
		replace name = subinword(name, "MICH",    "MICHAEL",     .) if word(name, `i')=="MICH"
		replace name = subinword(name, "MICLS",   "MICHAEL",     .) if word(name, `i')=="MICLS"
		replace name = subinword(name, "MIX",     "MICHAEL",     .) if word(name, `i')=="MIX"
		replace name = subinword(name, "NATH",    "NATHANIEL",   .) if word(name, `i')=="NATH"
		replace name = subinword(name, "NICH",    "NICHOLAS",    .) if word(name, `i')=="NICH"
		replace name = subinword(name, "NICS",    "NICHOLAS",    .) if word(name, `i')=="NICS"
		replace name = subinword(name, "OL",      "OLIVER",      .) if word(name, `i')=="OL"
		replace name = subinword(name, "PAT",     "PATRICK",     .) if word(name, `i')=="PAT"
		replace name = subinword(name, "PET",     "PETER",       .) if word(name, `i')=="PET"
		replace name = subinword(name, "PHIL",    "PHILIP",      .) if word(name, `i')=="PHIL"
		replace name = subinword(name, "PHIN",    "PHINEAS",     .) if word(name, `i')=="PHIN"
		replace name = subinword(name, "RAY",     "RAYMOND",     .) if word(name, `i')=="RAY"
		replace name = subinword(name, "REG",     "REGINALD",    .) if word(name, `i')=="REG"
		replace name = subinword(name, "RIC",     "RICHARD",     .) if word(name, `i')=="RIC"
		replace name = subinword(name, "RICHD",   "RICHARD",     .) if word(name, `i')=="RICHD"
		replace name = subinword(name, "ROBT",    "ROBERT",      .) if word(name, `i')=="ROBT"
		replace name = subinword(name, "ROG",     "ROGER",       .) if word(name, `i')=="ROG"
		replace name = subinword(name, "SAML",    "SAMUEL",      .) if word(name, `i')=="SAML"
		replace name = subinword(name, "SILV",    "SYLVESTER",   .) if word(name, `i')=="SILV"
		replace name = subinword(name, "SIM",     "SIMON",       .) if word(name, `i')=="SIM"
		replace name = subinword(name, "SOL",     "SOLOMON",     .) if word(name, `i')=="SOL"
		replace name = subinword(name, "STE",     "STEPHEN",     .) if word(name, `i')=="STE"
		replace name = subinword(name, "THEO",    "THEODORE",    .) if word(name, `i')=="THEO"
		replace name = subinword(name, "THOS",    "THOMAS",      .) if word(name, `i')=="THOS"
		replace name = subinword(name, "TIM",     "TIMOTHY",     .) if word(name, `i')=="TIM"
		replace name = subinword(name, "VAL",     "VALENTINE",   .) if word(name, `i')=="VAL"
		replace name = subinword(name, "VINC",    "VINCENT",     .) if word(name, `i')=="VINC"
		replace name = subinword(name, "WALT",    "WALTER",      .) if word(name, `i')=="WALT"
		replace name = subinword(name, "WIN",     "WINIFRED",    .) if word(name, `i')=="WIN"
		replace name = subinword(name, "WM",      "WILLIAM",     .) if word(name, `i')=="WM"
		replace name = subinword(name, "XPR",     "CHRISTOPHER", .) if word(name, `i')=="XPR"
		replace name = subinword(name, "XTIAN",   "CHRISTIAN",   .) if word(name, `i')=="XTIAN"
		replace name = subinword(name, "XTOPHER", "CHRISTOPHER", .) if word(name, `i')=="XTOPHER"
		replace name = subinword(name, "ZACH",    "ZACHARIAH",   .) if word(name, `i')=="ZACH"
	}

	
	// save individual level data
	compress
	save "$path/Replication/cleaned_data/WWI casualties Army Navy clean.dta", replace
	
	// save county level data
	//
	// One 0/1 indicator per casualty for "any death" and for each cause of death;
	// summing the indicators within county gives the county death counts, with 0
	// (not missing) where a county has no deaths of a given cause.
	gen byte dead_all = 1
	gen byte dead_acc = status=="Accident"
	gen byte dead_dis = status=="Disease"
	gen byte dead_kia = status=="KIA"
	gen byte dead_wnd = status=="Wounds"
	
	collapse (sum) dead_*, by(fips)
	
	// Drop casualties that could not be assigned to a county. Both a missing and
	// a zero FIPS code are removed: dropping only fips==0 and then recoding
	// missing values to 0 would put the unlocated group back under fips 0.
	drop if missing(fips) | fips==0
	
	compress
	save "$path/Replication/cleaned_data/WWI casualties Army Navy clean - county.dta", replace
